# Disappointed Expectations: Downward Mobility and Electoral Change
# Thomas Kurer and Briitta van Staalduinen
# American Political Science Review

# Main Analysis
# Using SOEP v35 2018 wave (main analysis) and SOEPlong (descriptives)
# Preparation

dev.off()
cat("\014")  
# globals
options(scipen=999)

# packages

if (!require("pacman")) install.packages("pacman") 

pacman::p_load(magrittr, tidyverse, broom, hrbrthemes, plm, estimatr, sandwich, lmtest, AER, lfe, huxtable, margins, readstata13, texreg, reshape2, readxl, xtablem, interplot)

library(coefplot)
library(randomForest)
library(interactions)
library(glmnet) # lasso and ridge regression for prediction
library(LARF)   # contains function for generating polynomials and interactions
library(hdm)    # lasso-based double selection for causal inference
library(caret)  # contains command for confusion matrix
library(randomForest) # random forests
library(rpart)        # decision trees
library(rpart.plot)   # library for plotting decision trees
library(grf)
library(SuperLearner)
library(truncnorm)

# ggplot theme
theme_set(theme_bw())

# define paths

datpath <- "..insert path.."
outpath <- "..insert path.."

# set seed
RNGkind(sample.kind = "Rounding")
# ensures consistent seeds across R versions
set.seed(1115)

# DATA PREPARATION ----

# soep 2018 (data main analysis) ----
soep <- read.dta13(paste0(datpath, "soep_pairs2018.dta"), convert.factors=FALSE, encoding="UTF-8")

# some recodings/cleaning
soep$bula[soep$bula<0] <- NA
# Thuringia, Saxony-Anhalt, Saxony, Mecklenburg, Brandenburg
soep$east <- ifelse(soep$bula %in% c(12, 13, 14, 15, 16), 1, 0)
soep$east[is.na(soep$bula)] <- NA

soep$birtheast <- ifelse(soep$birthregion %in% c(12, 13, 14, 15, 16), 1, 0)
soep$birtheast[is.na(soep$birthregion)] <- NA

# make dummified region1989
soep$east1989 <- ifelse(soep$region1989==1, 1, 0)
soep$west1989 <- ifelse(soep$region1989==2, 1, 0)
soep$abroad1989 <- ifelse(soep$region1989==3, 1, 0)
soep$bornafter1989 <- ifelse(soep$region1989==4, 1, 0)

# make dummified childloc
soep$loc_largecity <- ifelse(soep$childloc==1,1,0)
soep$loc_medcity <- ifelse(soep$childloc==2,1,0)
soep$loc_smallcity <- ifelse(soep$childloc==3,1,0)
soep$loc_cntryside <- ifelse(soep$childloc==4,1,0)

# make dummified sameloc

soep$sameloc_still <- ifelse(soep$sameloc==1,1,0)
soep$sameloc_again <- ifelse(soep$sameloc==2,1,0)
soep$sameloc_no <- ifelse(soep$sameloc==3,1,0)

# make dummified bula_child and bula_youth

for(i in c(2:16)) { # leave 1 out as reference
 n <- ncol(soep)+1
 nam <- paste("bula_child", i, sep = "")
 dummy <- assign(nam, ifelse(soep$bula_child==i, 1,0))
 soep <- cbind(soep, dummy)
 colnames(soep)[n] <- paste0("bula_child", i)
}

for(i in c(2:16)) { # leave 1 out as reference
 n <- ncol(soep)+1
 nam <- paste("bula_youth", i, sep = "")
 dummy <- assign(nam, ifelse(soep$bula_youth==i, 1,0))
 soep <- cbind(soep, dummy)
 colnames(soep)[n] <- paste0("bula_youth", i)
}

# other recodings for analysis

soep$fisco1d <- as.numeric(substring(soep$fisco88,1,1))

for(i in c(2:9)) { # leave 1 out as reference
 n <- ncol(soep)+1
 nam <- paste("fisco1d", i, sep = "")
 dummy <- assign(nam, ifelse(soep$fisco1d==i, 1,0))
 soep <- cbind(soep, dummy)
 colnames(soep)[n] <- paste0("fisco1d", i)
}

soep$hs <- ifelse(soep$edu>4,1,0)
soep$hs[is.na(soep$edu)] <- NA

soep$ybirth_full <- soep$ybirth
soep$mback <- ifelse(soep$migback==1, 0, 1)
soep$female <- factor(soep$female)

soep$empl <- NA
soep$empl[soep$empstat==1] <- 1
soep$empl[soep$empstat==2] <- 2
soep$empl[soep$empstat==3] <- 3
soep$empl[soep$empstat==4] <- 4
soep$empl[soep$empstat==9] <- 5
soep$empl[soep$empstat==5 | soep$empstat==6 |soep$empstat==7 |soep$empstat==8 |soep$empstat==10] <- 6

soep$alf <- NA
soep$alf[soep$empstat==9] <- 0
soep$alf[soep$empstat==1 | soep$empstat==2 |soep$empstat==4] <- 1
soep$alf[soep$empstat==3 | soep$empstat==5 | soep$empstat==6 |soep$empstat==7 |soep$empstat==8 |soep$empstat==10] <- 2


soep$anti <- ifelse(soep$afd==1 | soep$linke==1 | soep$partyid==7 | soep$partyid==26, 1, 0)
soep$anti[is.na(soep$partyid)] <- NA

soep$noid <- ifelse(soep$partyid_yn == 0, 1, 0)
soep$noid[is.na(soep$partyid_yn)] <- NA

soep$vanti <- ifelse(soep$vafd==1 | soep$vlinke==1 | soep$partyvote==7 | soep$partyvote==26, 1, 0)
soep$vanti[is.na(soep$partyvote)] <- NA

soep$age <- soep$syear-soep$ybirth

soep$vmainstream1 <- ifelse(soep$partyvote<5, 1, 0)
soep$vmainstream1[is.na(soep$partyvote)] <- NA

soep$vmainstream2 <- ifelse(soep$partyvote<6, 1, 0)
soep$vmainstream2[is.na(soep$partyvote)] <- NA

# load and merge auxiliary variables that are only asked irregularly
union <- read.dta13(paste0(datpath, "union.dta"), convert.factors=FALSE, encoding="UTF-8")
church <- read.dta13(paste0(datpath, "church.dta"), convert.factors=FALSE, encoding="UTF-8")

soep <- merge(soep, union, by.x="persnr", by.y="pid", all.x=T)
soep <- merge(soep, church, by.x="persnr", by.y="pid", all.x=T)

# merging of datasets created some duplicated observations. remove.
soep %<>% dplyr::select(-relhh)
soep <- unique(soep)


# soeplong (train data, data for longitudinal descriptives) ----
soeplong <- read.csv(paste0(datpath, "soeplong_prediction.csv"), stringsAsFactors = FALSE)

# some recodings/cleaning

# impute ybirth in missing years
soeplong %<>% group_by(pid) %>% mutate(ybirth_full=floor(mean(ybirth, na.rm=T))) %>% ungroup()
soeplong$age <- soeplong$syear-soeplong$ybirth_full

# make numeric bula
soeplong$bula_label <- soeplong$bula
soeplong$bula <- as.numeric(gsub("[^[:digit:].]", "",  soeplong$bula_label))
soeplong$bula[soeplong$bula_label=="[-2] trifft nicht zu"] <- NA

# make dummified region1989
soeplong$east1989 <- ifelse(soeplong$region1989=="east", 1, 0)
soeplong$west1989 <- ifelse(soeplong$region1989=="west", 1, 0)
soeplong$abroad1989 <- ifelse(soeplong$region1989=="abroad", 1, 0)
soeplong$bornafter1989 <- ifelse(soeplong$region1989=="born after 1989", 1, 0)

# make dummified childloc
soeplong$loc_largecity <- ifelse(soeplong$childloc==1,1,0)
soeplong$loc_medcity <- ifelse(soeplong$childloc==2,1,0)
soeplong$loc_smallcity <- ifelse(soeplong$childloc==3,1,0)
soeplong$loc_cntryside <- ifelse(soeplong$childloc==4,1,0)

# make dummified sameloc

soeplong$sameloc_still <- ifelse(soeplong$sameloc==1,1,0)
soeplong$sameloc_again <- ifelse(soeplong$sameloc==2,1,0)
soeplong$sameloc_no <- ifelse(soeplong$sameloc==3,1,0)

# make dummified bula_child and bula_youth

for(i in c(2:16)) { # leave 1 out as reference
 n <- ncol(soeplong)+1
 nam <- paste("bula_child", i, sep = "")
 dummy <- assign(nam, ifelse(soeplong$bula_child==i, 1,0))
 soeplong <- cbind(soeplong, dummy)
 colnames(soeplong)[n] <- paste0("bula_child", i)
}

for(i in c(2:16)) { # leave 1 out as reference
 n <- ncol(soeplong)+1
 nam <- paste("bula_youth", i, sep = "")
 dummy <- assign(nam, ifelse(soeplong$bula_youth==i, 1,0))
 soeplong <- cbind(soeplong, dummy)
 colnames(soeplong)[n] <- paste0("bula_youth", i)
}

soeplong$female <- factor(soeplong$female)

soeplong$mback <- ifelse(soeplong$migback=="[1] kein Migrationshintergrund", 0, 1)

soeplong$fisco1d <- as.numeric(substring(soeplong$fisco88,2,2))

for(i in c(2:9)) { # leave 1 out as reference
 n <- ncol(soeplong)+1
 nam <- paste("fisco1d", i, sep = "")
 dummy <- assign(nam, ifelse(soeplong$fisco1d==i, 1,0))
 soeplong <- cbind(soeplong, dummy)
 colnames(soeplong)[n] <- paste0("fisco1d", i)
}

write.csv(soeplong, paste0(datpath, "data_appendix_long.csv"))

# PREDICT OCCUPATIONAL STATUS: MODEL OPTIMIZATION ----

# train model ----

# training set: random sample of 2006-2010 soep respondents who are not part of 2018 wave (test/analysis data)

traintest_full <- soeplong %>% dplyr::select(pid, syear, ybirth_full, female, isei88_full, german, mback, childloc, contains("1989"), starts_with("loc_"), starts_with("bula_youth"), grades, starts_with("f")) %>% 
  filter(syear==2006 | syear==2007 | syear==2008 | syear==2009 | syear==2010) # before sample re-fresh in 2011


traintest_full$nrmiss <- rowSums(is.na(traintest_full))

# if varying nr of missing values over years, keep year with minimal nr of missings by pid
traintest_full %<>% group_by(pid) %>% filter(nrmiss==min(nrmiss)) %>% ungroup()

# randomly draw one observation by pid  
traintest_full %<>% group_by(pid) %>% sample_n(1) %>%
  dplyr::select(-fgerman, -nrmiss, -childloc, -fisei88, -fisco88, -fisco1d) %>% # drop unused variables
  filter(!is.na(ybirth_full)) %>% # drop observations with only household grid answered
  ungroup()

colSums(is.na(traintest_full))

# rf sample

traintest_full$sep <- ifelse(traintest_full$pid %in% soep$persnr, 0, 1)
traintest_full %<>% filter(sep==1)

colSums(is.na(traintest_full))

# rf1: include full set of potential predictors

traintest1 <- traintest_full %>% dplyr::select(-pid, -syear, -region1989, -sep, -abroad1989, -loc_cntryside, -bula_youth)
traintest1 <- na.omit(traintest1)
traintest1 <- data.frame(traintest1)


sample1 <- sample.int(n = nrow(traintest1), size = floor(.75*nrow(traintest1)), replace = FALSE)
train1 <- traintest1[sample1, ]
test1  <- traintest1[-sample1, ]

xtrain1=data.frame(train1[,-3])
xtest1=data.frame(test1[,-3])
ytrain1=train1[,3]
ytest1=test1[,3]

# mtry = 10, ntree=500
rf1_500_10=randomForest(ytrain1~., data=xtrain1, mtry=10, ntree=500)
rf1_500_10

varImpPlot(rf1_500_10, type=2)
yhattest_rf1_500_10=predict(rf1_500_10, newdata=test1)
mse_rf1_500_10 <- mean((ytest1-yhattest_rf1_500_10)^2)


# mtry = 5, ntree=500
rf1_500_5=randomForest(ytrain1~., data=xtrain1, mtry=5, ntree=500)
rf1_500_5

varImpPlot(rf1_500_5, type=2)
yhattest_rf1_500_5=predict(rf1_500_5, newdata=test1)
mse_rf1_500_5 <- mean((ytest1-yhattest_rf1_500_5)^2)

# mtry = 4, ntree=500

rf1_500_4=randomForest(ytrain1~., data=xtrain1, mtry=4, ntree=500)
rf1_500_4

varImpPlot(rf1_500_4, type=2)
yhattest_rf1_500_4=predict(rf1_500_4, newdata=test1)
mse_rf1_500_4 <- mean((ytest1-yhattest_rf1_500_4)^2)

# mtry = 3, ntree=500

rf1_500_3=randomForest(ytrain1~., data=xtrain1, mtry=3, ntree=500)
rf1_500_3

varImpPlot(rf1_500_3, type=2)
yhattest_rf1_500_3=predict(rf1_500_3, newdata=test1)
mse_rf1_500_3 <- mean((ytest1-yhattest_rf1_500_3)^2)

# mtry = 2, ntree=500

rf1_500_2=randomForest(ytrain1~., data=xtrain1, mtry=2, ntree=500)
rf1_500_2

varImpPlot(rf1_500_2, type=2)
yhattest_rf1_500_2=predict(rf1_500_2, newdata=test1)
mse_rf1_500_2 <- mean((ytest1-yhattest_rf1_500_2)^2)

# mtry = 10, ntree=1000
rf1_1000_10=randomForest(ytrain1~., data=xtrain1, mtry=10, ntree=1000)
rf1_1000_10

varImpPlot(rf1_1000_10, type=2)
yhattest_rf1_1000_10=predict(rf1_1000_10, newdata=test1)
mse_rf1_1000_10 <- mean((ytest1-yhattest_rf1_1000_10)^2)

# mtry = 5, ntree=1000
rf1_1000_5=randomForest(ytrain1~., data=xtrain1, mtry=5, ntree=1000)
rf1_1000_5

varImpPlot(rf1_1000_5, type=2)
yhattest_rf1_1000_5=predict(rf1_1000_5, newdata=test1)
mse_rf1_1000_5 <- mean((ytest1-yhattest_rf1_1000_5)^2)

# mtry = 4, ntree=1000

rf1_1000_4=randomForest(ytrain1~., data=xtrain1, mtry=4, ntree=1000)
rf1_1000_4

varImpPlot(rf1_1000_4, type=2)
yhattest_rf1_1000_4=predict(rf1_1000_4, newdata=test1)
mse_rf1_1000_4 <- mean((ytest1-yhattest_rf1_1000_4)^2)

# mtry = 3, ntree=1000

rf1_1000_3=randomForest(ytrain1~., data=xtrain1, mtry=3, ntree=1000)
rf1_1000_3

varImpPlot(rf1_1000_3, type=2)
yhattest_rf1_1000_3=predict(rf1_1000_3, newdata=test1)
mse_rf1_1000_3 <- mean((ytest1-yhattest_rf1_1000_3)^2)

# mtry = 2, ntree=1000

rf1_1000_2=randomForest(ytrain1~., data=xtrain1, mtry=2, ntree=1000)
rf1_1000_2

varImpPlot(rf1_1000_2, type=2)
yhattest_rf1_1000_2=predict(rf1_1000_2, newdata=test1)
mse_rf1_1000_2 <- mean((ytest1-yhattest_rf1_1000_2)^2)

# drop grades and bula_child (high missingness but some explanatory power)
# this is the "reduced model" specification detailed in the Supplementary Material

traintest2 <- traintest_full %>% dplyr::select(-pid, -syear, -region1989, -sep, -abroad1989, -loc_cntryside, -grades, -starts_with("bula_youth"))
traintest2 <- na.omit(traintest2)
traintest2 <- data.frame(traintest2)


sample2 <- sample.int(n = nrow(traintest2), size = floor(.75*nrow(traintest2)), replace = FALSE)
train2 <- traintest2[sample2, ]
test2  <- traintest2[-sample2, ]

xtrain2=data.frame(train2[,-3])
xtest2=data.frame(test2[,-3])
ytrain2=train2[,3]
ytest2=test2[,3]

# mtry = 5, ntree=500
rf2_500_5=randomForest(ytrain2~., data=xtrain2, mtry=5, ntree=500)
rf2_500_5

varImpPlot(rf2_500_5, type=2)
yhattest_rf2_500_5=predict(rf2_500_5, newdata=test2)
mse_rf2_500_5 <- mean((ytest2-yhattest_rf2_500_5)^2)

# mtry = 4, ntree=500

rf2_500_4=randomForest(ytrain2~., data=xtrain2, mtry=4, ntree=500)
rf2_500_4

varImpPlot(rf2_500_4, type=2)
yhattest_rf2_500_4=predict(rf2_500_4, newdata=test2)
mse_rf2_500_4 <- mean((ytest2-yhattest_rf2_500_4)^2)

# mtry = 3, ntree=500

rf2_500_3=randomForest(ytrain2~., data=xtrain2, mtry=3, ntree=500)
rf2_500_3

varImpPlot(rf2_500_3, type=2)
yhattest_rf2_500_3=predict(rf2_500_3, newdata=test2)
mse_rf2_500_3 <- mean((ytest2-yhattest_rf2_500_3)^2)

# mtry = 2, ntree=500

rf2_500_2=randomForest(ytrain2~., data=xtrain2, mtry=2, ntree=500)
rf2_500_2

varImpPlot(rf2_500_2, type=2)
yhattest_rf2_500_2=predict(rf2_500_2, newdata=test2)
mse_rf2_500_2 <- mean((ytest2-yhattest_rf2_500_2)^2)


# mtry = 5, ntree=1000
rf2_1000_5=randomForest(ytrain2~., data=xtrain2, mtry=5, ntree=1000)
rf2_1000_5

varImpPlot(rf2_1000_5, type=2)
yhattest_rf2_1000_5=predict(rf2_1000_5, newdata=test2)
mse_rf2_1000_5 <- mean((ytest2-yhattest_rf2_1000_5)^2)

# mtry = 4, ntree=1000

rf2_1000_4=randomForest(ytrain2~., data=xtrain2, mtry=4, ntree=1000)
rf2_1000_4

varImpPlot(rf2_1000_4, type=2)
yhattest_rf2_1000_4=predict(rf2_1000_4, newdata=test2)
mse_rf2_1000_4 <- mean((ytest2-yhattest_rf2_1000_4)^2)

# mtry = 3, ntree=1000

rf2_1000_3=randomForest(ytrain2~., data=xtrain2, mtry=3, ntree=1000)
rf2_1000_3

varImpPlot(rf2_1000_3, type=2)
yhattest_rf2_1000_3=predict(rf2_1000_3, newdata=test2)
mse_rf2_1000_3 <- mean((ytest2-yhattest_rf2_1000_3)^2)

# mtry = 2, ntree=1000

rf2_1000_2=randomForest(ytrain2~., data=xtrain2, mtry=2, ntree=1000)
rf2_1000_2

varImpPlot(rf2_1000_2, type=2)
yhattest_rf2_1000_2=predict(rf2_1000_2, newdata=test2)
mse_rf2_1000_2 <- mean((ytest2-yhattest_rf2_1000_2)^2)

# robustness: compare to ensemble model

# Super Learner averages over several machine learning methods to find optimal prediction 
# 15-fold cross-validation is used
sl1=SuperLearner(Y=ytrain1,X=xtrain1, family=gaussian, SL.library=c("SL.mean", "SL.cforest",  "SL.ipredbagg", "SL.nnet", "SL.glmnet", "SL.ksvm" ), cvControl = list(V = 15) )


sl1$times$everything
predictions_sl1 = predict(sl1, xtest1, onlySL = TRUE) # save all predictions
yhattest_sl1 <- as.vector(predictions_sl1$pred) # use weighted ensemble prediction

qplot(yhattest_sl1) + theme_minimal()
qplot(ytest1, yhattest_sl1) + theme_minimal()

# mean squared error in test data
mse_sl1=mean((ytest1-yhattest_sl1)^2)
mse_sl1

SL.rf.optimal = function(...) {
  SL.randomForest(..., num.trees = 1000, mtry=4)
}

cv_sl = CV.SuperLearner(Y = ytrain1, X = xtrain1, family = gaussian, V = 3,
                        SL.library = c("SL.mean", "SL.glmnet", "SL.ksvm", "SL.cforest", "SL.rf.optimal"))
summary(cv_sl)
plot(cv_sl) + theme(text = element_text(size=24))
ggsave(paste0(outpath, "FigureA4_SuperLearner_eval.pdf"))

# reduced vars, larger sample

sl2=SuperLearner(Y=ytrain2,X=xtrain2, newX=xtest2, family=gaussian, SL.library=c("SL.cforest",  "SL.ipredbagg", "SL.nnet", "SL.glmnet", "SL.ksvm" ), cvControl = list(V = 15) )

sl2$times$everything
predictions_sl2 = predict(sl2, xtest2, onlySL = TRUE) # save all predictions
yhattest_sl2 <- as.vector(predictions_sl2$pred) # use weighted ensemble prediction

qplot(yhattest_sl2) + theme_minimal()
qplot(ytest2, yhattest_sl2) + theme_minimal()

# mean squared error in test data
mse_sl2=mean((ytest2-yhattest_sl2)^2)
mse_sl2

# evaluate model performance ----

compare <- cbind(mse_rf1_1000_2, mse_rf1_1000_3, mse_rf1_1000_4, mse_rf1_1000_5, mse_rf1_1000_10, mse_rf1_500_2, mse_rf1_500_3, mse_rf1_500_4, mse_rf1_500_5, mse_rf1_500_10, mse_rf2_1000_2, mse_rf2_1000_3, mse_rf2_1000_4, mse_rf2_1000_5, mse_rf2_500_2, mse_rf2_500_3, mse_rf2_500_4, mse_rf2_500_5, mse_sl1, mse_sl2)
compare <- data.frame(compare)
compare %<>% pivot_longer(cols = everything(), names_to="model", values_to="mse")
compare$type <- substr(compare$model, 5,6)
compare$ntree <- as.numeric(gsub("_", "", substr(compare$model, 8,12)))
compare$varbysplit <- as.numeric(gsub("_", "", str_sub(compare$model, -2, -1)))
compare$dat <- as.numeric(str_sub(compare$model,7,7))
compare$npredictors <- ifelse(compare$dat==1, "extended", "reduced")
compare$model <- ifelse(compare$type=="rf", "RandomForest", "Ensemble")

compare %<>% dplyr::select(model, npredictors, ntree, varbysplit, mse) %>% arrange(mse)
compare

print(xtable(compare, type = "latex"), file =paste0(outpath,"TableA1_model_comparison_mse.tex"))

result <- data.frame(cbind(ytest2, yhattest_sl2, yhattest_rf2_500_3))
names(result) <-c("ytest", "ensemble", "randomforest")

result_long <- result %>% pivot_longer(!ytest, names_to="class", values_to="prediction")


setEPS()
postscript(paste0(outpath, "FigureA2_varImpPlot_full.eps"))
varImpPlot(rf1_1000_3, main="Random forest variable importance plot")
dev.off()

setEPS()
postscript(paste0(outpath, "FigureA3_varImpPlot_red.eps"))
varImpPlot(rf2_1000_4, main="Random forest variable importance plot")
dev.off()



# ISD measure ----
# create predictions in SOEP 2018

soep$isei_pred=predict(rf2_1000_3, newdata=soep)

soep$isd <- soep$isei_pred-soep$isei88_full

soep$isei_pred2=predict(rf1_1000_4, newdata=soep)

soep$isd2 <- soep$isei_pred2-soep$isei88_full

# ranking version (percentiles)

soep <- soep %>% mutate(isei100 = ntile(isei88_full, 100), 
                        isei_pred100 = ntile(isei_pred, 100), isei_pred2100 = ntile(isei_pred2, 100),
                        isd100= ntile(isd, 100), isd2100 = ntile(isd2, 100))

write.csv(soep, paste0(datpath, "data_main.csv"))

# Validation ----

# include mother's status

traintest_m_full <- soeplong %>% dplyr::select(pid, syear, ybirth_full, female, isei88_full, german, mback, childloc, contains("1989"), starts_with("loc_"), starts_with("bula_youth"), grades, starts_with("f"), misei88_full) %>% 
  filter(syear==2006 | syear==2007 | syear==2008 | syear==2009 | syear==2010) # before sample re-fresh in 2011

traintest_m_full$female <- as.numeric(as.character(traintest_m_full$female))
traintest_m_full$nrmiss <- rowSums(is.na(traintest_m_full))

# if varying nr of missing values over years, keep year with minimal nr of missings by pid
traintest_m_full %<>% group_by(pid) %>% filter(nrmiss==min(nrmiss)) %>% ungroup()

# randomly draw one observation by pid  
traintest_m_full %<>% group_by(pid) %>% sample_n(1) %>%
  dplyr::select(-fgerman, -nrmiss, -childloc, -fisei88, -fisco88, -fisco1d) %>% # drop unused variables
  filter(!is.na(ybirth_full)) %>% # drop observations with only household grid answered
  ungroup()

colSums(is.na(traintest_m_full))

# rf sample

traintest_m_full$sep <- ifelse(traintest_m_full$pid %in% soep$persnr, 0, 1)
traintest_m_full %<>% filter(sep==1)

colSums(is.na(traintest_m_full))

# rf1: include full set of potential predictors

traintest1m <- traintest_m_full %>% dplyr::select(-pid, -syear, -region1989, -sep, -abroad1989, -loc_cntryside, -bula_youth)
traintest1m <- na.omit(traintest1m)
traintest1m <- data.frame(traintest1m)


sample1m <- sample.int(n = nrow(traintest1m), size = floor(.75*nrow(traintest1m)), replace = FALSE)
train1m <- traintest1m[sample1m, ]
test1m  <- traintest1m[-sample1m, ]

xtrain1m=data.frame(train1m[,-3])
xtest1m=data.frame(test1m[,-3])
ytrain1m=train1m[,3]
ytest1m=test1m[,3]

# mtry = 4, ntree=1000

rf1_1000_4m=randomForest(ytrain1m~., data=xtrain1m, mtry=4, ntree=1000)
rf1_1000_4m

soep$isei_pred2m=predict(rf1_1000_4m, newdata=soep)

cor <- round(cor(soep$isei_pred2, soep$isei_pred2m, use="complete")[1],3)
par(pty="s")
pdf(paste0(outpath, "FigureA1_mothers.pdf") )
plot(soep$isei_pred2, soep$isei_pred2m,
     xlab="Prediction based on Father's Occupational Status",
     ylab="Prediction based on Father's and Mother's Occ. Status",
     cex.axis=1.5, cex.lab=1.5,
     asp=1)
text(35, 65, paste0("Correlation: ", cor))
abline(coef = c(0,1))
dev.off()

# correlation with satisfaction

covarsv <- paste(c("isd2", "factor(edu)", "female", "age", "mback", "factor(alf)", "factor(sameloc)", "log(incomem)", "factor(bula)", "factor(region1989)"),
                collapse="+")

covarsvstd <- paste(c("scale(isd2)", "factor(edu)", "female", "scale(age)", "mback", "factor(alf)", "factor(sameloc)", "scale(log(incomem))", "factor(bula)", "factor(region1989)"),
                collapse="+")

v1 <- lm(paste("satlifenow~", covarsv, sep=""), data=soep)
v2 <- lm(paste("satlifein1y~", covarsv, sep=""), data=soep)
v3 <- lm(paste("satlifein5y~", covarsv, sep=""), data=soep)

v4 <- lm(paste("satlifenow~", covarsvstd, sep=""), data=soep)
v5 <- lm(paste("satlifein1y~", covarsvstd, sep=""), data=soep)
v6 <- lm(paste("satlifein5y~", covarsvstd, sep=""), data=soep)

screenreg(list(v1,v2,v3,v4,v5,v6))

texreg(list(v4, v5, v6), digits=3, 
       label="tab:reg_validation_satisfaction", caption="Validation against Life Satisfaction", caption.above=TRUE,
       custom.model.names=c("Satisf. now", "Satisf. in 1 Year", "Satisf. in 5 Years"), 
       custom.coef.map = list('scale(isd2)' = 'Status Discordance (standardized)',
                              'scale(age)'='Age (standardized)',
                              'scale(log(incomem))' = 'Income (log, standardized)',
                              'female1' = 'Female (1=yes)',
                              'mback' = 'Migration Background (1=yes)',
                              'factor(edu)1' = 'Educ: Elementary',
                              'factor(edu)2' = 'Educ: Lower Second.',
                              'factor(edu)3' = 'Educ: Secondary',
                              'factor(edu)4' = 'Educ: University Prep.',
                              'factor(edu)5' = 'Educ: Tertiary I',
                              'factor(edu)6' = 'Educ: Tertiary II',
                              'factor(alf)1' = 'Emp. Status: not in labor force',
                              'factor(alf)2' = 'Emp. Status: other',
                              'factor(sameloc)2' = 'Location: Returned',
                              'factor(sameloc)3' = 'Location: Moved Away',
                              'factor(region1989)2' = 'In 1989: West',
                              'factor(region1989)3' = 'In 1989: Abroad',
                              'factor(region1989)4' = 'In 1989: Born later',
                              '(Intercept)' = 'Intercept'),
       custom.note=paste("%stars. All models include covariates as in Table 1 and regional (Bundesland) fixed effects."),
       file=paste0(outpath, "TableA2_reg_validation_satisfaction.tex"))

# validation against verbalized expectations
# (non-representative!) subset of young labor market entrants in bioage17 data set is asked about career aspirations at age 17

mean(soep$age[!is.na(soep$byisei)])
table(soep$age[!is.na(soep$byisei)])

soep$iseidiff <- soep$byisei-soep$isei88_full

plot(soep$iseidiff, soep$isd2)
cor(soep$iseidiff, soep$isd2, use="complete")
cor(soep$iseidiff[soep$iseidiff!=0], soep$isd2[soep$iseidiff!=0], use="complete")
cor(soep$iseidiff[soep$iseidiff!=0&soep$bybwunja==1], soep$isd2[soep$iseidiff!=0&soep$bybwunja==1], use="complete")

mod1 <- lm(isd2~iseidiff, data=soep)
mod1sum <- summary(mod1)

mod2 <- lm(isd2~iseidiff, data=subset(soep, iseidiff!=0))
mod2sum <- summary(mod2)

soep$modcolor <- ifelse(soep$iseidiff==0, "gray", "black")

r21 = mod1sum$adj.r.squared
r22 = mod2sum$adj.r.squared

beta1 = mod1sum$coefficients[2,1]
beta2 = mod2sum$coefficients[2,1]

my.p1 = mod1sum$coefficients[2,4]
my.p2 = mod2sum$coefficients[2,4]


rp = vector('expression',3)
rp[1] = substitute(expression(italic(R)^2 == MYVALUE), 
		list(MYVALUE = format(r22,dig=3)))[2]
rp[2] = substitute(expression(beta == MYOTHERVALUE), 
		list(MYOTHERVALUE = format(beta2, digits = 3)))[2]
rp[3] = substitute(expression(italic(p) == MYTHIRDVALUE), 
		list(MYTHIRDVALUE = format("<0.000", digits=3)))[2]

pdf(paste0(outpath, "FigureA6_validation_plot.pdf"))

plot(soep$iseidiff, soep$isd, col=soep$modcolor, 
     xlab = 'Aspiration at age 17 vs. Realized',
     ylab = 'Intergenerational Status Discordance', cex.axis=1.5, cex.lab=1.5)
abline(mod2, col='black')
abline(mod1, col='gray')

legend('topright', legend = rp, bty = 'n')

dev.off()

# multivariate

val_mod1 <- lm(paste("iseidiff~", covarsv, sep=""), data=soep)
val_mod2 <- lm(paste("iseidiff~", covarsv, sep=""), data=subset(soep, iseidiff!=0))


texreg(list(val_mod1, val_mod2), digits=3, 
       label="tab:reg_validation", caption="Validation against verbalized expectations (multivariate)", caption.above=TRUE,
       custom.model.names=c("Full Model", "Dropping Zero Diff"), 
       custom.coef.map = list('isd2' = 'Status Discordance',
                              'female1' = 'Female (1=yes)',
                              'age'='Age',
                              'mback' = 'Migration Background (1=yes)',
                              'factor(edu)1' = 'Educ: Elementary',
                              'factor(edu)2' = 'Educ: Lower Second.',
                              'factor(edu)3' = 'Educ: Secondary',
                              'factor(edu)4' = 'Educ: University Prep.',
                              'factor(edu)5' = 'Educ: Tertiary I',
                              'factor(edu)6' = 'Educ: Tertiary II',
                              'factor(alf)1' = 'Emp. Status: not in labor force',
                              'factor(alf)2' = 'Emp. Status: other',
                              'log(incomem)' = 'Income (log)',
                              'factor(sameloc)2' = 'Location: Returned',
                              'factor(sameloc)3' = 'Location: Moved Away',
                              'factor(region1989)2' = 'In 1989: West',
                              'factor(region1989)3' = 'In 1989: Abroad',
                              'factor(region1989)4' = 'In 1989: Born later',
                              '(Intercept)' = 'Intercept'),
       custom.note=paste("%stars. Both models a full set of covariates as in Table 1 and regional (Bundesland) fixed effects."),
       file=paste0(outpath, "TableA3_reg_validation.tex"))

